{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Events聚类\n",
    "\n",
    "熟悉各中聚类算法的调用\n",
    "并用评价指标选择合适的超参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "import time\n",
    "from collections import defaultdict\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第一步：抽取出只在训练集和测试集中出现的event"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "我们只关心train和test中出现的user和event，因此重点处理这部分关联数据\n",
    "\n",
    "train.csv 有6列：\n",
    "user：用户ID\n",
    "event：活动ID\n",
    "invited：是否被邀请（0/1）\n",
    "timestamp：ISO-8601 UTC格式时间字符串，表示用户看到该活动的时间\n",
    "interested, and not_interested\n",
    "\n",
    "Test.csv 除了没有interested, and not_interested，其余列与train相同\n",
    " \"\"\"\n",
    "    \n",
    "# 统计训练集中有多少不同的用户的events\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "#倒排表\n",
    "#统计每个用户参加的活动   / 每个活动参加的用户\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "    \n",
    "for filename in [\"train.csv\", \"test.csv\"]:\n",
    "    f = open(filename, 'rb')\n",
    "    \n",
    "    #忽略第一行（列名字）\n",
    "#     f.readline().strip().split(\",\")\n",
    "    f.readline().decode('utf8').strip().split(\",\")\n",
    "    \n",
    "    for line in f:    #对每条记录\n",
    "        cols = line.decode('utf8').strip().split(\",\")\n",
    "        uniqueUsers.add(cols[0])   #第一列为用户ID\n",
    "        uniqueEvents.add(cols[1])   #第二列为活动ID\n",
    "        \n",
    "        #eventsForUser[cols[0]].add(cols[1])    #该用户参加了这个活动\n",
    "        #usersForEvent[cols[1]].add(cols[0])    #该活动被用户参加\n",
    "    f.close()\n",
    "\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count= 13418\n"
     ]
    }
   ],
   "source": [
    "import csv\n",
    "with open('events.csv') as f_events:\n",
    "    f_csv = csv.reader(f_events)\n",
    "    headers = next(f_csv)\n",
    "    count = 0\n",
    "    simplified_events_list = []\n",
    "    for row in f_csv:\n",
    "        # Process row\n",
    "#         print('row[0]=', row[0])\n",
    "        if row[0] in uniqueEvents: # 判断值是否在set集合中的速度很快, 因为查找set用到了hash，时间在O(1)级别。\n",
    "            simplified_events_list.append(row)\n",
    "            count += 1 ;\n",
    "with open('simplified_events.csv','w') as f_simplified_events:\n",
    "    f_simplified_csv = csv.writer(f_simplified_events)\n",
    "    f_simplified_csv.writerow(headers)\n",
    "    f_simplified_csv.writerows(simplified_events_list)\n",
    "    print('count=', count)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取抽取出的13418条event数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>c_1</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 110 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     event_id     user_id                start_time city state  zip country  \\\n",
       "0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN   \n",
       "\n",
       "   lat  lng  c_1   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  c_99  \\\n",
       "0  NaN  NaN    2   ...        0     1     0     0     0     0     0     0   \n",
       "1  NaN  NaN    2   ...        0     0     0     0     0     0     0     0   \n",
       "2  NaN  NaN    0   ...        0     0     0     0     0     0     0     0   \n",
       "3  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "4  NaN  NaN    1   ...        0     0     0     0     0     0     0     0   \n",
       "\n",
       "   c_100  c_other  \n",
       "0      0        9  \n",
       "1      0        7  \n",
       "2      0       12  \n",
       "3      0        8  \n",
       "4      0        9  \n",
       "\n",
       "[5 rows x 110 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "活动描述信息在events.csv文件：共110维特征\n",
    "前9列：event_id, user_id, start_time, city, state, zip, country, lat, and lng.\n",
    "event_id：id of the event, \n",
    "user_id：id of the user who created the event.  \n",
    "city, state, zip, and country： more details about the location of the venue (if known).\n",
    "lat and lng： floats（latitude and longitude coordinates of the venue）\n",
    "start_time： 字符串，ISO-8601 UTC time，表示活动开始时间\n",
    "\n",
    "后101列为词频：count_1, count_2, ..., count_100，count_other\n",
    "count_N：活动描述出现第N个词的次数\n",
    "count_other：除了最常用的100个词之外的其余词出现的次数\n",
    " \"\"\"\n",
    "\n",
    "#读取数据\n",
    "events = pd.read_csv(\"simplified_events.csv\")\n",
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 13418 entries, 0 to 13417\n",
      "Columns: 110 entries, event_id to c_other\n",
      "dtypes: float64(2), int64(103), object(5)\n",
      "memory usage: 11.3+ MB\n"
     ]
    }
   ],
   "source": [
    "events.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "event_id          0\n",
       "user_id           0\n",
       "start_time        0\n",
       "city           7092\n",
       "state          8868\n",
       "zip           12004\n",
       "country        7065\n",
       "lat            5356\n",
       "lng            5356\n",
       "c_1               0\n",
       "c_2               0\n",
       "c_3               0\n",
       "c_4               0\n",
       "c_5               0\n",
       "c_6               0\n",
       "c_7               0\n",
       "c_8               0\n",
       "c_9               0\n",
       "c_10              0\n",
       "c_11              0\n",
       "c_12              0\n",
       "c_13              0\n",
       "c_14              0\n",
       "c_15              0\n",
       "c_16              0\n",
       "c_17              0\n",
       "c_18              0\n",
       "c_19              0\n",
       "c_20              0\n",
       "c_21              0\n",
       "              ...  \n",
       "c_72              0\n",
       "c_73              0\n",
       "c_74              0\n",
       "c_75              0\n",
       "c_76              0\n",
       "c_77              0\n",
       "c_78              0\n",
       "c_79              0\n",
       "c_80              0\n",
       "c_81              0\n",
       "c_82              0\n",
       "c_83              0\n",
       "c_84              0\n",
       "c_85              0\n",
       "c_86              0\n",
       "c_87              0\n",
       "c_88              0\n",
       "c_89              0\n",
       "c_90              0\n",
       "c_91              0\n",
       "c_92              0\n",
       "c_93              0\n",
       "c_94              0\n",
       "c_95              0\n",
       "c_96              0\n",
       "c_97              0\n",
       "c_98              0\n",
       "c_99              0\n",
       "c_100             0\n",
       "c_other           0\n",
       "Length: 110, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "### 查看是否有空值\n",
    "events.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "后101列关键词的词频没有空值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c_1</th>\n",
       "      <th>c_2</th>\n",
       "      <th>c_3</th>\n",
       "      <th>c_4</th>\n",
       "      <th>c_5</th>\n",
       "      <th>c_6</th>\n",
       "      <th>c_7</th>\n",
       "      <th>c_8</th>\n",
       "      <th>c_9</th>\n",
       "      <th>c_10</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   c_1  c_2  c_3  c_4  c_5  c_6  c_7  c_8  c_9  c_10   ...     c_92  c_93  \\\n",
       "0    2    0    2    0    0    0    0    0    0     0   ...        0     1   \n",
       "1    2    0    2    0    0    0    0    0    0     0   ...        0     0   \n",
       "2    0    0    0    0    0    0    0    0    0     0   ...        0     0   \n",
       "3    1    0    2    1    0    0    0    0    0     0   ...        0     0   \n",
       "4    1    1    0    0    0    0    0    2    0     0   ...        0     0   \n",
       "\n",
       "   c_94  c_95  c_96  c_97  c_98  c_99  c_100  c_other  \n",
       "0     0     0     0     0     0     0      0        9  \n",
       "1     0     0     0     0     0     0      0        7  \n",
       "2     0     0     0     0     0     0      0       12  \n",
       "3     0     0     0     0     0     0      0        8  \n",
       "4     0     0     0     0     0     0      0        9  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#读取训练数据\n",
    "train = pd.read_csv(\"simplified_events.csv\")\n",
    "\n",
    "X_train = train.drop(columns=['event_id', 'user_id', 'start_time', 'city', 'state', 'zip', 'country', 'lat', 'lng'],axis=1)\n",
    "X_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(13418, 101)\n"
     ]
    }
   ],
   "source": [
    "print(X_train.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第二步：聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 数据标准化\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# 初始化特征的标准化器\n",
    "ss_X = StandardScaler()\n",
    "\n",
    "# 对训练数据的特征进行标准化处理\n",
    "X_train = ss_X.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 一个参数点（聚类数据为K）的模型\n",
    "def K_cluster_analysis(K, X_train):\n",
    "    start = time.time()\n",
    "    \n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    mb_kmeans.fit(X_train)\n",
    "    \n",
    "    # 在训练集和测试集上测试\n",
    "    #y_train_pred = mb_kmeans.fit_predict(X_train)\n",
    "#     y_val_pred = mb_kmeans.predict(X_val)\n",
    "    \n",
    "    #以前两维特征打印训练数据的分类结果\n",
    "    #plt.scatter(X_train[:, 0], X_train[:, 1], c=y_pred)\n",
    "    #plt.show()\n",
    "\n",
    "    # K值的评估标准\n",
    "    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))\n",
    "    CH_score = metrics.silhouette_score(X_train,mb_kmeans.predict(X_train))\n",
    "    \n",
    "    #也可以在校验集上评估K\n",
    "#     v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "    \n",
    "    end = time.time()\n",
    "    print(\"CH_score: {}, time elaps:{}\".format(CH_score, int(end-start)))\n",
    "#     print(\"v_score: {}\".format(v_score))\n",
    "    \n",
    "#     return CH_score,v_score\n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第三步：CH_scores计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.011124370134721982, time elaps:9\n",
      "K-means begin with clusters: 20\n",
      "CH_score: -0.08455643082745105, time elaps:9\n",
      "K-means begin with clusters: 30\n",
      "CH_score: -0.06671164629924597, time elaps:8\n",
      "K-means begin with clusters: 40\n",
      "CH_score: -0.08535364453904831, time elaps:9\n",
      "K-means begin with clusters: 50\n",
      "CH_score: -0.14905233000305848, time elaps:8\n",
      "K-means begin with clusters: 60\n",
      "CH_score: -0.09353846943359116, time elaps:8\n",
      "K-means begin with clusters: 70\n",
      "CH_score: -0.07274189494824121, time elaps:9\n",
      "K-means begin with clusters: 80\n",
      "CH_score: -0.08682236827603933, time elaps:8\n",
      "K-means begin with clusters: 90\n",
      "CH_score: -0.10522853611192089, time elaps:8\n",
      "K-means begin with clusters: 100\n",
      "CH_score: -0.053821019881258696, time elaps:8\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, X_train)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "CH_score结果不是太理想，尝试着调小K的范围"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 2\n",
      "CH_score: 0.466207995612021, time elaps:10\n",
      "K-means begin with clusters: 3\n",
      "CH_score: 0.4238349232120738, time elaps:10\n",
      "K-means begin with clusters: 4\n",
      "CH_score: 0.21584331953106534, time elaps:10\n",
      "K-means begin with clusters: 5\n",
      "CH_score: 0.17462366991791103, time elaps:10\n",
      "K-means begin with clusters: 6\n",
      "CH_score: 0.28261211397256514, time elaps:10\n",
      "K-means begin with clusters: 7\n",
      "CH_score: 0.19354678810412532, time elaps:10\n",
      "K-means begin with clusters: 8\n",
      "CH_score: 0.19527968824938455, time elaps:10\n",
      "K-means begin with clusters: 9\n",
      "CH_score: 0.0726351517244242, time elaps:10\n"
     ]
    }
   ],
   "source": [
    "# 缩小超参数（聚类数目K）搜索范围\n",
    "Ks = [2, 3, 4, 5, 6, 7, 8, 9]\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, X_train)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从运行结果可以看出当K=2时，CH_score最大，K-means聚类效果最好"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第四步：结果显示/分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x6332f16160>]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xuc1HW9x/HXGxAVvKGs5RFwUckkK4EJ9VgcjVJQA00rvCSZhsB6NDmZl+pY2PF6vFQPNEnNSgkvZZGVRnmvo7EgaIjERdTV0C1NU8wV+Jw/vrO5wrI7wMz+5vJ+Ph7z2Jnf/H4zn+Xynt98v9/f96uIwMzMakO3rAswM7Ou49A3M6shDn0zsxri0DczqyEOfTOzGuLQNzOrIQ59M7Ma4tA3M6shDn0zsxrSI+sC1tW3b9+or6/Pugwzs4oyd+7cv0ZEXWf7lV3o19fX09jYmHUZZmYVRdLThezn5h0zsxri0DczqyEOfTOzGuLQNzOrIQ59M7Ma4tA3M6shDn0zsxpSNaEfAWedBfPnZ12JmVn5qprQX7oUvvc9GDIExoyBOXOyrsjMrPxUTegPGgQrVsAFF8Dvfw/Dh8Po0fCHP2RdmZlZ+aia0AfYYQf46ldT+F98McydCwceCCNHwv33Z12dmVn2qir0W227LZx9Njz1FFxxBTzxBBx0EIwYAbNnp/Z/M7NaVJWh36p3bzjzTFi+HL7znfQhcMghcMAB8KtfOfzNrPYUFPqSRklaLGmppHM62O8YSSEpl39cL+kNSfPzt+8Wq/CNsfXWcNppqbP3u9+FlSvh8MPhQx+Cn/0M1q7Noiozs67XaehL6g5MA0YDg4FjJQ1uZ79tgdOBR9Z5allE7Ju/TSxCzZtsyy3h1FNhyRK44Qb4+9/hqKPSiJ/bbnP4m1n1K+RMfziwNCKWR0QLMBMY285+FwCXAv8sYn0lscUWcNJJ8OST8KMfQUsLfPrTsM8+MGMGrFmTdYVmZqVRSOjvCjzb5nFTftu/SBoC9I+IO9s5fqCkRyXdL+kj7b2BpAmSGiU1Njc3F1r7ZuvRA044Af70J7jlFujeHY4/HvbeG268Ed56q8tKMTPrEoWEvtrZ9q8uUEndgCuB/2pnv78AAyJiCDAFmCFpu/VeLGJ6ROQiIldX1+lqX0XXvXs601+wAH76U9hmm/RNYK+90gVfLS1dXpKZWUkUEvpNQP82j/sBz7d5vC2wD3CfpBXA/sAsSbmIeDMi/gYQEXOBZcB7ilF4KXTrltr4586FX/wC6upgwgTYc0+YNg3+WfYNV2ZmHSsk9OcAgyQNlNQTGAfMan0yIl6JiL4RUR8R9cDDwJiIaJRUl+8IRtLuwCBgedF/iyKT4Igj4OGH4e67YcCANPpn993hqqtg1aqsKzQz2zSdhn5ErAZOA+4GFgG3RsRCSVMljenk8BHAY5IWALcDEyPipc0tuqtIaVz/gw/CPffAe9+bxv0PHAiXXgqvvZZ1hWZmG0dRZlco5XK5aGxszLqMDXrooTS/z29+AzvuCFOmpG8B22+fdWVmVsskzY2IXGf7VfUVuaXw4Q+nJp+HH4Z///c01099PZx/PrxUMd9hzKxWOfQ30X77pc7euXPh4INh6tQU/uedB3/9a9bVmZm1z6G/mYYOTcM8H3sMDjssze65225pQZeVK7OuzszsnRz6RfL+98PMmbBwIXzyk2l2z4ED4Ywz4Lnnsq7OzCxx6BfZ3nunqR0WL4bjjoOrr05DPSdPhqefzro6M6t1Dv0S2XNPuP76NLnbSSfBddelbaecAsuWZV2dmdUqh36J1den6ZyXLYOJE+Gmm9L0DuPHp6mezcy6kkO/i/Tv//ZCLmeckaZy/vCHPbWDmXUth34X22UXuPxymDULXnghhb+ZWVdx6Gdk5Mg0rcO0aVlXYma1xKGfESmN6HnkkXSBl5lZV3DoZ+jEE9Pi7Vdf
nXUlZlYrHPoZ2n77tHLXjBnw8stZV2NmtcChn7HJk9MInu9/P+tKzKwWOPQz9oEPpKGbV18Na9dmXY2ZVTuHfhmYPDldvDV7dtaVmFm1Kyj0JY2StFjSUknndLDfMZJCUq7NtnPzxy2WdGgxiq42Rx8N73qXh2+aWel1Gvr5NW6nAaOBwcCxkga3s9+2wOnAI222DSatqfs+YBRwdeuaufa2nj3hC1+AO++EFSuyrsbMqlkhZ/rDgaURsTwiWoCZwNh29rsAuBRoO7HAWGBmRLwZEU8BS/OvZ+uYMCGN3b/22qwrMbNqVkjo7wo82+ZxU37bv0gaAvSPiDs39tj88RMkNUpqbG5uLqjwatO/P4wdm2bjfPPNrKsxs2pVSOirnW3/Wk1dUjfgSuC/NvbYf22ImB4RuYjI1dXVFVBSdZo8OS216Pl4zKxUCgn9JqB/m8f9gOfbPN4W2Ae4T9IKYH9gVr4zt7NjrY2RI9O0y75C18xKpZDQnwMMkjRQUk9Sx+ys1icj4pWI6BsR9RFRDzwMjImIxvx+4yRtKWkgMAj4Y9F/iyohwaRJ8H//B48+mnU1ZlaNOg39iFgNnAbcDSwCbo2IhZKmShrTybELgVuBJ4C7gIaIWLP5ZVev8eOhVy8P3zSz0lDEek3smcrlctHY2Jh1GZmaMCGtsPXcc9CnT9bVmFklkDQ3InKd7ecrcstQQwO88QbceGPWlZhZtXHol6EPfhAOPBCuucbz8ZhZcTn0y9TkybBkCfz2t1lXYmbVxKFfpo4+Gnbe2cM3zay4HPplasst4ZRT4Be/gGeeyboaM6sWDv0yduqp6afn4zGzYnHol7EBA+ATn4Dvfc/z8ZhZcTj0y1xDAzQ3w+23Z12JmVUDh36ZGzkSBg1yh66ZFYdDv8x165aGb/7hDzB/ftbVmFmlc+hXgPHjYeutfbZvZpvPoV8B+vSB44+Hm2+Gv/8962rMrJI59CtEQwOsWgU/+EHWlZhZJXPoV4h994UDDkhNPJ6Px8w2lUO/gjQ0wJ//DPfck3UlZlapHPoV5JhjoK7OC6yY2aYrKPQljZK0WNJSSee08/xESY9Lmi/pIUmD89vrJb2R3z5f0neL/QvUktb5eGbN8nw8ZrZpOg19Sd2BacBoYDBwbGuotzEjIt4fEfsClwJXtHluWUTsm79NLFbhtap1Pp7p07Otw8wqUyFn+sOBpRGxPCJagJnA2LY7RMSrbR72BsprDcYqsttucMQRno/HzDZNIaG/K/Bsm8dN+W3vIKlB0jLSmf7pbZ4aKOlRSfdL+shmVWtAukL3xRfhpz/NuhIzqzSFhL7a2bbemXxETIuIPYCzga/mN/8FGBARQ4ApwAxJ2633BtIESY2SGpubmwuvvkZ9/OOw557u0DWzjVdI6DcB/ds87gc838H+M4EjASLizYj4W/7+XGAZ8J51D4iI6RGRi4hcXV1dobXXrNb5eH7/e1iwIOtqzKySFBL6c4BBkgZK6gmMA2a13UHSoDYPDweW5LfX5TuCkbQ7MAhYXozCa93nPuf5eMxs43Ua+hGxGjgNuBtYBNwaEQslTZU0Jr/baZIWSppPasYZn98+AnhM0gLgdmBiRLxU9N+iBvXpA8cdBzfdBK+8knU1ZlYpFFFeA21yuVw0NjZmXUZFmDcPhg2Db30LTj+98/3NrHpJmhsRuc728xW5FWzoUNh//9TEU2af3WZWphz6FW7yZFi82PPxmFlhHPoV7lOfgr59PXzTzArj0K9wW20FJ58MP/85NDVlXY2ZlTuHfhWYODG16V97bdaVmFm5c+hXgfr6t+fjaWnJuhozK2cO/SoxeTK88ILn4zGzjjn0q8Qhh8Aee/gKXTPrmEO/SnTrBpMmwYMPwuOPZ12NmZUrh34VOemkNJrHwzfNbEMc+lVkxx3h2GM9H4+ZbZhDv8o0NMDrr8MPf5h1JWZWjhz6VWbYMBg+3PPxmFn7HPpVqKEBnnwS7r0360rMrNw49KvQ
pz8NO+3k4Ztmtj6HfhVqnY/nZz/zfDxm9k4Fhb6kUZIWS1oq6Zx2np8o6XFJ8yU9JGlwm+fOzR+3WNKhxSzeNmziRFi7Nk3NYGbWqtPQz69xOw0YDQwGjm0b6nkzIuL9EbEvcClwRf7YwaQ1dd8HjAKubl0z10pr4EA47DCYPt3z8ZjZ2wo50x8OLI2I5RHRAswExrbdISJebfOwN9A6bmQsMDMi3oyIp4Cl+dezLtDQACtXwh13ZF2JmZWLQkJ/V+DZNo+b8tveQVKDpGWkM/3TN+ZYK41DD4Xdd3eHrpm9rZDQVzvb1hsBHhHTImIP4GzgqxtzrKQJkholNTY3NxdQkhWidT6eBx7wfDxmlhQS+k1A/zaP+wHPd7D/TODIjTk2IqZHRC4icnV1dQWUZIVqnY/nmmuyrsTMykEhoT8HGCRpoKSepI7ZWW13kDSozcPDgSX5+7OAcZK2lDQQGAT8cfPLtkLttBOMGwc/+hG8+mrn+5tZdes09CNiNXAacDewCLg1IhZKmippTH630yQtlDQfmAKMzx+7ELgVeAK4C2iIiDUl+D2sA5Mnw2uvpeA3s9qmKLMJWnK5XDQ2NmZdRtUZPjwF/8KFoPZ6WsysokmaGxG5zvbzFbk1oqEBFi2C++/PuhIzy5JDv0Z8+tNpvn0vsGJW2xz6NWLrrdN8PHfcAc89l3U11ioCHnsMVq/OuhKrFQ79GuL5eMrPNdfABz+YVjxz8FtXcOjXkN13h9Gj03w8b72VdTX24INwxhkwaBDcfjuMHw9rPLbNSsyhX2MmT4a//CVNu2zZaWqCY45JE+P98Y9w0UUwYwacckr6NmZWKj2yLsC61qhRKWimTYNPfSrramrTP/8Jn/wkrFqVVjfbYQc455y0/RvfgC23TM0+HlprpeDQrzHdu6e2/bPPTmP23/e+rCuqLRFpPqQ5c1Kn+uA2k5Sffz68+SZcfHEK/quucvBb8bl5pwZ9/vMpVDz7ZtebNg1uvBG+9jU48sh3PifBhRfCmWfCt7+dPpjL7NpJqwIO/RrUt2+aj+eHP4R//CPramrHAw+kQD/iCPj619vfR4LLL08X0112Gfz3f3dpiVYDHPo1yvPxdK1nn00dt3vsATfdlKa93hApnemfcgp885vpZlYsDv0aNXw45HKpucFNCKX1xhtw1FGpo/ZnP4Ptt+/8mG7d4Npr4bOfTU1Bl11W+jqtNjj0a9jkyfDEE6nZwUojInWcz52bzvDf+97Cj+3WDW64AT7zGfjyl9PZv9nmcujXsHHjoE8fz8dTSt/5Tuo7+frXYcyYTndfT48eqQnuqKPShVzXXlv0Eq3GOPRr2NZbp5E8d9wBz3e0FpptkvvugylTYOzY1ESzqbbYAmbOTB3AEyfC979ftBKtBjn0a9ykSWnOF8/HU1zPPJMufhs0KJ3pd9RxW4iePeG22+CQQ9LEeTffXJw6rfYU9E9R0ihJiyUtlXROO89PkfSEpMck/U7Sbm2eWyNpfv42a91jLVt77JGu0vV8PMXT2nHb0pI6brfbrjivu9VW6VvZQQfBiSemDwGzjdVp6EvqDkwDRgODgWMlDV5nt0eBXER8ALgduLTNc29ExL752ya0alqpNTSk5p2f/zzrSipfBEyYAPPmpY7bvfYq7uv36gWzZsEBB8Bxx/nvzDZeIWf6w4GlEbE8IlqAmcDYtjtExL0RsSr/8GGgX3HLtFIaPRp2281X6BbDt76Vwn7qVPjEJ0rzHttsA7/6FQwblpqQfv3r0ryPVadCQn9X4Nk2j5vy2zbkZKDtP8OtJDVKeljSkRs6yLLTvXtq27/33jSE0zbNvffCl76Uplf4yldK+17bbQd33QXvf39qSvrtb0v7flY9Cgn99qZ8avdyHkknADmg7aUkA/KL9R4HXCVpj3aOm5D/YGhsbm4uoCQrtpNP9nw8m2PFinTW/Z73FKfjthA77AC/+U1qQhozxusfW2EK+afZBPRv87gfsN4AP0kf
A74CjImIN1u3R8Tz+Z/LgfuAIeseGxHTIyIXEbm6urqN+gWsOPr2Tevoej6ejbdqVTrbXr06ddxuu23XvfdOO8Hs2VBfD4cfDn/4Q9e9t1WmQkJ/DjBI0kBJPYFxwDtG4UgaAlxLCvwX22zvI2nL/P2+wIGAGxDKVENDCvybbsq6ksoRAV/4AixYkIZRvuc9XV/DzjvD734H//ZvqX9mzpyur8EqR6ehHxGrgdOAu4FFwK0RsVDSVEmto3EuA7YBbltnaObeQKOkBcC9wMUR4dAvU8OHw9ChqYnH8/EU5sor04pXF1yQzrSzsssucM896cz/kENg/vzsarHypiiz/925XC4aGxuzLqNm3XBDat+//34YMSLrasrb736XAvbII9Mat+Ww4MmKFfAf/wGvv56uCN5nn6wrsq4iaW6+/7RDviLX3qF1Ph536HbsqafSRGjvfW9aFKUcAh9S2/4996RO+ZEj4ckns67Iyo1D396hVy846ST4yU/SAuq2vtaO2zVr0sVRXdlxW4g99kjBL8FHPwpLl2ZdkZUTh76tZ+LENBLluuuyrqT8RKTmr8cegx//GPbcM+uK2rfXXmnsfktLCv4VK7KuyMqFQ9/WM2gQHHpomsZ39eqsqykvl1+eZrz8n/9JcxaVs332ScH/2mtw8MFp9S4zh761q6EBnnvOc7u0NXt2Wqz8mGPgnPWmHSxP++6bLuB66aV0xu8mO3PoW7sOO8zz8bS1fHnquB08OM1nXy4dt4XI5dKUDStXps7dF1/s/BirXg59a1f37qlt/557YNGirKvJ1uuvp2GZEemK2222ybqijXfAAfDLX8LTT8PHPgZ/+1vWFVlWHPq2QSefnBbvuOaarCvJTkRaXWzhwtSWv8d6M0dVjhEj0rTMf/4zfPzj8PLLWVdkWXDo2wbV1aX5eH7wg9QZWIsuuwxuvRUuvDB1ble6kSPTQiwLF6aO6Fdfzboi62oOfevQ5MkpGGpxeb6774Zzz00ffF/+ctbVFM/o0WnVrXnz0v1a/UCvVQ5969D++8OQITBtWm3Nx7NsWbo6+X3vS1NTVFLHbSHGjEnXGTzySFrsZdWqzo+x6uDQtw5J6Wz/8cfhkktqIxxeey113Eqp47Z376wrKo1jjklTad9/f/p9//nPrCuyruDQt04dd1xqCz733DS3y0UXwSuvZF1VaUSkaSieeAJuuQV23z3rikrruOPSN5nZs9OHQEtL1hVZqTn0rVO9eqUrOx94II35Pu88GDAg/ay2Md+XXJJmzLz44jTCpRZ87nPw3e+mIZ2f+Qy89VbWFVkpOfStYB/5SFqQe968NJLl4ovTmf/pp8Mzz2Rd3eb79a/TB9m4cWmt21py6qnw7W+n5qwTTvD0G9XMoW8bbciQNIxx0aIUkNdck8avf/7zsHhx1tVtmqVLU1PHBz4A119ffR23hfjP/3x7iOpJJ6VZRK36FBT6kkZJWixpqaT1Zh2RNEXSE5Iek/Q7Sbu1eW68pCX52/hiFm/Z2muv1B68bFnq7J05E/beOw1xfPTRrKsr3D/+kToyu3VLY9h79cq6oux86UvwzW+mJTNPPRXWrs26Iiu6iOjwBnQHlgG7Az2BBcDgdfY5GOiVvz8JuCV/f0dgef5nn/z9Ph2937Bhw8Iq0wsvRJx3XsR220VAxOjREQ88kHVVHVu7NuLooyO6dYuYPTvrasrH176W/g4nTUp/Rlb+gMboJM8joqAz/eHA0ohYHhEtwExg7DofHPdGROtgvoeBfvn7hwKzI+KliHgZmA2U+YS0tql23jlNOfzMM+kK1sbGdOn/Rz6S2svLcZz/RRelBWMuvTTNSWPJN76RLki75hqYMqU8/+5s0xQS+rsCbWfibspv25CTgV9v4rFWBbbfPg3vXLEidQ4+/XSatXPYsHQlaLm0Ff/yl/DVr6a2/ClTsq6mvEipo/6MM+Cqq9Lfp4O/OhQS+u11abX71y/pBCAHXLYxx0qaIKlRUmNz
c3MBJVkl6NUrdQ4uXZqmI3799dTe3zo9cZZjwpcsgeOPhw9+EL73vdrsuO2MBFdemWZbveQS+PrXs67IiqFHAfs0Af3bPO4HPL/uTpI+BnwF+I+IeLPNsQetc+x96x4bEdOB6QC5XM7nE1WmZ880Fvyzn00dpRdemEb6nH8+nHVWms2zKztPWztue/Rwx21npDQFR0sLTJ2aFlw/77ziv09E+gb41lvr31pa2t9eyHO9esGxx8JWWxW/5kql6OQ7m6QewJ+BkcBzwBzguIhY2GafIcDtwKiIWNJm+47AXGBoftM8YFhEvLSh98vlctHY2Lhpv41VhIg0mdmFF8KDD6bZPM88M40A2n770r732rXpytNZs9KKUh/9aGnfr1qsWZM+uG+6KYXodtsVP6BLZcKEtPRntZM0NyJyne3X6Zl+RKyWdBpwN2kkzw0RsVDSVFJv8SxSc842wG1K35OfiYgxEfGSpAtIHxQAUzsKfKsNUprWd9SoFPoXXZTOHi++OC3T+MUvpk7hUrjwwnR2f+WVDvyN0b17apLr2TP9+W2xRbr17Pn2/XVvvXt3/HxHz23u863PXXYZ/O//wuGHp0nmrIAz/a7mM/3a9OijKfRvuy01IXzhC2nM+IABxXuPO+9M//GPPz5NNOZ2/OrX0gL77QdNTWnSwHe/O+uKSqfQM31fkWtlYciQNMHZokVpNE2xr/JdvDiF/ZAhMH26A79W9OwJM2akmVNPOskjkMChb2Vmr73SNAjFvMr31VdTx21r08TWWxe3Zitve++dmnjuuit1Stc6h76VpQED4FvfSmP9zz03dfwOHZrG+z/4YOGvs3YtnHhiGqJ5223FbS6yyjF5cvq3c9ZZadrsWubQt7K2uVf5XnAB/PzncMUVcNBBXVKylSEpzRO17bapme/NNzs/plo59K0ibOgq36FDN3yV76xZ6YKiE09MF4lZbXvXu1LT4fz58LWvZV1Ndhz6VlHWvcr3jTfav8r3ySfTvPDDhqUFQtxxa5DWAz711NTGf++9WVeTDYe+VaTWq3wXLkxn+r17p5E+e+6Z5oo58sh0FaY7bm1dl18Ogwalb4Avv5x1NV3PoW8VrXv3dIXt3Lmpjb++Pl3du2xZWvawf/9OX8JqTO/ecPPNsHIlTJpUe8M4HfpWFVqv8n3gAfj979OaviNGZF2VlatcLk0ffcst6QOglviKXDOrSWvWwMEHw4IF6VZfn3VFm8dX5JqZdaB79zQdB6QZYMtlnYdSc+ibWc2qr09X6T70UJr7qRY49M2sph1/PIwbl67pmDOn090rnkPfzGqaBFdfDbvskq7teP31rCsqLYe+mdW8Pn1S+/6SJdW/XrJD38yMNDfTWWelqbdnzcq6mtIpKPQljZK0WNJSSee08/wISfMkrZZ0zDrPrZE0P3+r4j9KM6t0F1wA++6b1m1euTLrakqj09CX1B2YBowGBgPHShq8zm7PAJ8DZrTzEm9ExL75mxcsM7OyVQuLrhRypj8cWBoRyyOiBZgJjG27Q0SsiIjHgLUlqNHMrMtU+6IrhYT+rsCzbR435bcVaitJjZIelnTkRlVnZpaBal50pZDQb29S2o350jMgf2nwccBVkvZY7w2kCfkPhsbm5uaNeGkzs+Kr5kVXCgn9JqDtXIX9gOcLfYOIeD7/czlwHzCknX2mR0QuInJ1dXWFvrSZWclU66IrhYT+HGCQpIGSegLjgIJG4UjqI2nL/P2+wIFAlX1ZMrNqVY2LrnQa+hGxGjgNuBtYBNwaEQslTZU0BkDShyQ1AZ8CrpW0MH/43kCjpAXAvcDFEeHQN7OKUW2LrnhqZTOzTjQ2wgEHwNFHw49/XJ7Lb3pqZTOzIqmmRVcc+mZmBTj7bPjwh6GhAVasyLqaTefQNzMrQPfu8KMfpfuVvOiKQ9/MrEDVsOiKQ9/MbCNU+qIrDn0zs41Q6YuuOPTNzDZSJS+64tA3M9sElbroikPfzGwTVeKiKw59M7NN
VImLrjj0zcw2Q6UtuuLQNzPbTJMnw+jRlbHoikPfzGwztS66ss025b/oikPfzKwI3v3uFPzlvuiKQ9/MrEgqYdEVh76ZWRGV+6IrDn0zsyLq3TvNub9yJUyaVH7DOAsKfUmjJC2WtFTSOe08P0LSPEmrJR2zznPjJS3J38YXq3Azs3LVdtGVm27Kupp36jT0JXUHpgGjgcHAsZIGr7PbM8DngBnrHLsjcD6wHzAcOF9Sn80v28ysvJXroiuFnOkPB5ZGxPKIaAFmAmPb7hARKyLiMWDtOsceCsyOiJci4mVgNjCqCHWbmZW11kVXpPJadKWQ0N8VeLbN46b8tkIUdKykCZIaJTU2NzcX+NJmZuWtHBddKST021v3vdCuiYKOjYjpEZGLiFxdXV2BL21mVv7KbdGVQkK/Cejf5nE/4PkCX39zjjUzq3jltuhKIaE/BxgkaaCknsA4oNDZo+8GDpHUJ9+Be0h+m5lZzSinRVc6Df2IWA2cRgrrRcCtEbFQ0lRJYwAkfUhSE/Ap4FpJC/PHvgRcQPrgmANMzW8zM6sp5bLoiqLMrhzI5XLR2NiYdRlmZkXX0gL77QdNTfD442m+nmKRNDcicp3t5ytyzcy6SDksuuLQNzPrQlkvuuLQNzPrYlkuuuLQNzPrYlkuuuLQNzPLQFaLrjj0zcwyksWiKw59M7MMdfWiKw59M7MMdfWiKz1K+/JmZtaZXA6++U1YtSqFvtqbqrJIHPpmZmXg7LO75n3cvGNmVkMc+mZmNcShb2ZWQxz6ZmY1xKFvZlZDHPpmZjXEoW9mVkMc+mZmNaTslkuU1Aw8vRkv0Rf4a5HKKbVKqhUqq95KqhUqq95KqhUqq97NqXW3iKjrbKeyC/3NJamxkHUiy0El1QqVVW8l1QqVVW8l1QqVVW9X1OrmHTOzGuLQNzOrIdUY+tOzLmAjVFKtUFn1VlKtUFn1VlKtUFn1lrzWqmvTNzOzDavGM30zM9uAqgh9Sf0l3StpkaSFks7IuqaOSNpK0h8lLcjX+42sa+qMpO6SHpV0Z9a1dEbSCkmPS5ovqTHrejoiaQdJt0t6Mv/v94Csa9oQSXvl/0xbb69K+mLWdW2IpDPz/7/+JOnHkrbKuqYNkXRGvs6Fpf4V99JuAAADN0lEQVQzrYrmHUm7ALtExDxJ2wJzgSMj4omMS2uXJAG9I+I1SVsADwFnRMTDGZe2QZKmADlgu4g4Iut6OiJpBZCLiLIfmy3pB8CDEXGdpJ5Ar4j4e9Z1dUZSd+A5YL+I2JzrakpC0q6k/1eDI+INSbcCv4qIG7OtbH2S9gFmAsOBFuAuYFJELCnF+1XFmX5E/CUi5uXv/wNYBOyabVUbFslr+Ydb5G9l++krqR9wOHBd1rVUE0nbASOA6wEioqUSAj9vJLCsHAO/jR7A1pJ6AL2A5zOuZ0P2Bh6OiFURsRq4HziqVG9WFaHflqR6YAjwSLaVdCzfXDIfeBGYHRHlXO9VwJeBtVkXUqAAfiNprqQJWRfTgd2BZuD7+aaz6yT1zrqoAo0Dfpx1ERsSEc8B/ws8A/wFeCUifpNtVRv0J2CEpJ0k9QIOA/qX6s2qKvQlbQP8BPhiRLyadT0diYg1EbEv0A8Ynv+KV3YkHQG8GBFzs65lIxwYEUOB0UCDpBFZF7QBPYChwDURMQR4HTgn25I6l2+GGgPclnUtGyKpDzAWGAj8G9Bb0gnZVtW+iFgEXALMJjXtLABWl+r9qib0823jPwFujoifZl1PofJf5+8DRmVcyoYcCIzJt5PPBD4q6aZsS+pYRDyf//kicAeprbQcNQFNbb7l3U76ECh3o4F5EfFC1oV04GPAUxHRHBFvAT8F/j3jmjYoIq6PiKERMQJ4CShJez5USejnO0avBxZFxBVZ19MZSXWSdsjf35r0D/TJbKtqX0ScGxH9IqKe9JX+nogoyzMmAEm985355JtKDiF9fS47EbESeFbSXvlNI4GyHHywjmMp46advGeA
/SX1yufDSFJfX1mStHP+5wDgk5Twz7dHqV64ix0IfBZ4PN9ODnBeRPwqw5o6sgvwg/wIiG7ArRFR9kMhK8S7gDvS/3N6ADMi4q5sS+rQfwI355tMlgMnZVxPh/Jtzh8HTs26lo5ExCOSbgfmkZpKHqW8r8z9iaSdgLeAhoh4uVRvVBVDNs3MrDBV0bxjZmaFceibmdUQh76ZWQ1x6JuZ1RCHvplZDXHom5nVEIe+mVkNceibmdWQ/wd/2vI5zgxTDgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制K取不同值时模型的性能，找到最佳模型／参数（分数最高）\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从图形可以看出，K=2 时评价分数最高，因此将event聚为两类时效果最好"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 绘制聚类结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXYAAAD8CAYAAABjAo9vAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAADz9JREFUeJzt3X+MpVV9x/H3p6wGQQz+GCtl12BbiyWUah2JlqQ2LtZFEWxjE2kktJpsTLTFxoYfxcQ0sdHEaq1Rixu1mkCwihLFVmFFDWmi1NkVEFwEYq2uojtGq7Qk0pVv/5hLXdfdnXvnObPPzJn3K9nMfe49c57vA8snh3Oe89xUFZKkfvzS2AVIktoy2CWpMwa7JHXGYJekzhjsktQZg12SOjN1sCd5f5J9Se444L23JLkrye1Jrkty4uqUKUma1iwj9g8A2w56bydwelWdAdwNXN6oLknSCk0d7FV1M/CDg967sar2Tw6/CGxuWJskaQU2NezrFcA/H+7DJNuB7QDHH3/8M5/2tKc1PLUk9W/Xrl3fr6q55do1CfYkVwD7gasP16aqdgA7AObn52thYaHFqSVpw0jyn9O0GxzsSS4CzgW2lg+ekaTRDQr2JNuAS4HnVtUDbUqSJA0xy+2O1wBfAE5NsjfJK4F3AicAO5PcmuTKVapTkjSlqUfsVXXBId5+X8NaJEkNuPNUkjpjsEtSZwx2SeqMwS5JnTHYJakzBrskdcZgl6TOGOyS1BmDXZI6Y7BLUmcMdknqjMEuSZ0x2CWpMwa7JHXGYJekzhjsktQZg12SOmOwS1JnZvnO0/cn2ZfkjgPee1ySnUnumfx87OqUKUma1iwj9g8A2w567zLgpqp6KnDT5FiSNKKpg72qbgZ+cNDb5wMfnLz+IPCSRnVJklZo6Bz7L1fVfQCTn08cXpIkaYijtniaZHuShSQLi4uLR+u0krThDA327yU5CWDyc9/hGlbVjqqar6r5ubm5gaeVJB3O0GD/BHDR5PVFwMcH9idJGmiW2x2vAb4AnJpkb5JXAm8Gnp/kHuD5k2NJ0og2Tduwqi44zEdbG9UiSWrAnaeS1BmDXZI6Y7BLUmcMdknqjMEuSZ0x2CWpMwa7JHXGYJekzhjsktQZg12SOmOwS1JnDHZJ6ozBLkmdMdglqTMGuyR1xmCXpM4Y7JLUGYNdkjpjsEtSZ5oEe5K/THJnkjuSXJPk2Bb9SpJmNzjYk5wM/AUwX1WnA8cALxvaryRpZVpNxWwCHpVkE3Ac8J1G/UqSZjQ42Kvq28DfAd8E7gN+VFU3HtwuyfYkC0kWFhcXh55WknQYLaZiHgucDzwF+BXg+CQvP7hdVe2oqvmqmp+bmxt6WknSYbSYijkb+I+qWqyq/wU+Bvxug34lSSvQIti/CTw7yXFJAmwF9jToV5K0Ai3m2G8BrgV2A1+Z9LljaL+SpJXZ1KKTqnoD8IYWfUmShnHnqSR1xmCXpM4Y7JLUGYNdkjpjsEtSZwx2SeqMwS5JnTHYJakzBrskdcZgl6TOGOyS1BmDXZI6Y7BLUmcMdknqjMEuSZ0x2CWpMwa7JHXGYJekzjQJ9iQnJrk2yV1J9iR5Tot+JUmza/Kdp8A/AJ+uqpcmeSRwXKN+JUkzGhzsSR4D/B7wpwBV9SDw4NB+JUkr02Iq5leBReCfknw5yXuTHH9woyTbkywkWVhcXGxwWknSobQI9k3A7wD/WFXPAP4HuOzgRlW1o6rmq2p+bm6uwWklSYfSItj3Anur6pbJ8bUsBb0kaQSDg72qvgt8K8mpk7e2Al8d2q8kaWVa3RXz58DVkztivg78WaN+JUkzahLsVXUrMN+iL0nSMO48laTOGOyS1BmDXZI6Y7BLUmcMdknqjMEuSZ0x2CWpMwa7JHXGYJekzhjsktQZg12SOmOwS1JnDHZJ6ozBLkmdMdglqTMGuyR1xmCXpM4Y7JLUmWbB
nuSYJF9O8slWfUqSZtdyxH4xsKdhf9rAbrwRnvlMOP98qBq7Gml9aRLsSTYDLwLe26I/6d3vhve8B376U7jttrGrkdaXViP2twOXAA8drkGS7UkWkiwsLi42Oq02gmTsCqT1ZXCwJzkX2FdVu47Urqp2VNV8Vc3Pzc0NPa0696pXwfbtS6F+xhljVyOtL5sa9HEWcF6SFwLHAo9JclVVvbxB39qgtm1b+iNpdoNH7FV1eVVtrqpTgJcBnzXUJWk83scuSZ1pMRXz/6rq88DnW/YpSZqNI3ZJ6ozBrqPOzUfS6jLYddS5+UhaXQa7RuXmI6k9g11HnZuPpNXV9K4YaRpuPpJWlyN2rRkuqkptGOxaM1xUldow2LUmuagqrZzBrjXDRVWpDRdPtWa4qCq14Yhd3XIxVhuVwa5uuRirjcpg14bgYqw2EoNd3XIxVhuVwa6ZjDFvPcs5D2z7ghfA7t1w/fWO2LWxGOyayRjz1rOc03l1yWDXAGOMgmc5p6N0bVSDgz3JliSfS7InyZ1JLm5RmNamMeatZzmn8+oSpAZOlCY5CTipqnYnOQHYBbykqr56uN+Zn5+vhYWFQeeVpI0mya6qml+u3eARe1XdV1W7J6/vB/YAJw/tV2uTm36kta/pHHuSU4BnALcc4rPtSRaSLCwuLrY8rY4iFyelta9ZsCd5NPBR4LVV9eODP6+qHVU1X1Xzc3NzrU6rEbk4Ka1NTYI9ySNYCvWrq+pjLfrU2uTipLT2tbgrJsD7gD1V9bbhJWkt27ZtZZt+ppmbd/5eaqPFiP0s4ELgeUlunfx5YYN+1ZFp5uadv5faGPw89qr6N8DZVk1tmpG+8/fSyrnzVEfFNHPzzt9LbQzeoLQSblCSpNkdtQ1KkqS1xWCXpM4Y7JLUGYNdkjpjsEtSZwx2rRkH7jy94QZ3oUorZbBrzThw5+mb3+wuVGmlDHatee5ClWZjsGvNOHDn6SWXuAtVWimDXcu79lrYsgU+/enDNnnTm+C44+Ckk+Chh1Z2mgOfHHnOOSt7iqQkg13TeOlLYevWIzZ5xzvgyiuXQv0jHzlKdUk6JINdzTnClsY1+LG92gCuuw5uugn27IFnPQse//hfaPLqVy/NkZ9wwtIAX9J4fLqjJK0TPt1RoxuyoOrX5EkrZ7Br1QxZUPVr8qSVaxLsSbYl+VqSe5Nc1qJP9WXIgqqLsdJsBgd7kmOAdwHnAKcBFyQ5bWi/Wv8eXlCF2RdU/Zo8aeVajNjPBO6tqq9X1YPAh4DzG/SrMb3udbBpE7zxjUuT3QmcffZMc9+vfz088AB873vwSwf8TZumjwM3Kzlil2bTIthPBr51wPHeyXs/J8n2JAtJFhYXFxucVqvqrW+FU05Zer1r11LI02bu2/lzaXW1CPZDjad+YRxWVTuqar6q5ufm5hqcVmNrMZJ2NC611yLY9wJbDjjeDHynQb8a06WXwje+sfT83DPOgP374aabuPzMnYPnvp0/l1bX4A1KSTYBdwNbgW8DXwL+pKruPNzvuEFJkmZ31DYoVdV+4DXADcAe4MNHCnWtE8cdtzSkftzj4MUvXnp90UU/v6h6lLhZSZpNk/vYq+pfq+o3qurXqupvW/SpkT3wwM9eX3/90kNg4OcXVY8SF1ul2bjzVOuKi63S8gx2Hdrxxy/9/OEP4bTT4P774aqr4MILf7aoes89R6UUF1ul2fh0R0laJ3y6oyRtUAa7JHXGYJekzhjsktQZg12SOmOwS1JnDHZJ6ozBLkmdMdglqTMGuyR1xmCXpM4Y7JLUGYNdkjpjsEtSZwx2SerMoGBP8pYkdyW5Pcl1SU5sVZgkaWWGjth3AqdX1RnA3cDlw0uSJA0xKNir6saq2j85/CKweXhJkqQhWs6xvwL4VMP+JEkrsGm5Bkk+AzzpEB9dUVUfn7S5AtgPXH2EfrYD2wGe/OQnr6hYSdLylg32qjr7SJ8nuQg4F9haR/hm7KraAeyApS+znrFOSdKU
lg32I0myDbgUeG5VPdCmJEnSEEPn2N8JnADsTHJrkisb1CRJGmDQiL2qfr1VIZKkNtx5KkmdMdglqTMGuyR1xmCXpM4Y7JLUGYNdkjpjsEtSZwx2SeqMwS5JnTHYJakzBrskdcZgl6TOGOyS1BmDXZI6Y7BLUmcMdknqjMEuSZ0x2CWpMwa7JHWmSbAn+askleQJLfqTJK3c4GBPsgV4PvDN4eVIkoZqMWL/e+ASoBr0JUkaaNOQX05yHvDtqrotyXJttwPbJ4c/SXLHkHOvcU8Avj92Eauo5+vr+drA61vvTp2mUaqOPNBO8hngSYf46Argr4E/qKofJfkGMF9Vy/5DTbJQVfPTFLgeeX3rV8/XBl7fejft9S07Yq+qsw9zgt8CngI8PFrfDOxOcmZVfXfGeiVJjax4KqaqvgI88eHjWUbskqTVM9Z97DtGOu/R4vWtXz1fG3h9691U17fsHLskaX1x56kkdcZgl6TOjBbsSd6S5K4ktye5LsmJY9XSWpI/TnJnkoeSdHPrVZJtSb6W5N4kl41dT0tJ3p9kX6/7K5JsSfK5JHsmfzcvHrumlpIcm+Tfk9w2ub6/Gbum1pIck+TLST65XNsxR+w7gdOr6gzgbuDyEWtp7Q7gj4Cbxy6klSTHAO8CzgFOAy5Ictq4VTX1AWDb2EWsov3A66rqN4FnA6/u7N/fT4DnVdVvA08HtiV59sg1tXYxsGeahqMFe1XdWFX7J4dfZOk++C5U1Z6q+trYdTR2JnBvVX29qh4EPgScP3JNzVTVzcAPxq5jtVTVfVW1e/L6fpYC4uRxq2qnlvz35PARkz/d3BmSZDPwIuC907RfK3PsrwA+NXYROqKTgW8dcLyXjoJhI0lyCvAM4JZxK2lrMlVxK7AP2FlVPV3f21l6JtdD0zQe9KyY5RzpcQRV9fFJmytY+t/Eq1ezltamubbOHOphQN2MiDaKJI8GPgq8tqp+PHY9LVXVT4GnT9brrktyelWt+zWTJOcC+6pqV5Lfn+Z3VjXYD/c4gocluQg4F9ha6+yG+uWurUN7gS0HHG8GvjNSLVqBJI9gKdSvrqqPjV3Paqmq/0ryeZbWTNZ9sANnAecleSFwLPCYJFdV1csP9wtj3hWzDbgUOK+qHhirDk3tS8BTkzwlySOBlwGfGLkmTSlLD3R6H7Cnqt42dj2tJZl7+M66JI8CzgbuGreqNqrq8qraXFWnsPTf3WePFOow7hz7O4ETgJ1Jbk1y5Yi1NJXkD5PsBZ4D/EuSG8auaajJQvdrgBtYWnj7cFXdOW5V7SS5BvgCcGqSvUleOXZNjZ0FXAg8b/Lf262TEWAvTgI+l+R2lgYhO6tq2dsCe+UjBSSpM2vlrhhJUiMGuyR1xmCXpM4Y7JLUGYNdkjpjsEtSZwx2SerM/wGwoFs+Db+qGwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#显示聚类结果\n",
    "#画出聚类结果，每一类用一种颜色\n",
    "colors = ['b','r']\n",
    "\n",
    "n_clusters = 2\n",
    "mb_kmeans = MiniBatchKMeans(n_clusters = n_clusters)\n",
    "mb_kmeans.fit(X_train)\n",
    "\n",
    "y_train_pred = mb_kmeans.labels_\n",
    "cents = mb_kmeans.cluster_centers_#质心\n",
    "\n",
    "for i in range(n_clusters):\n",
    "    index = np.nonzero(y_train_pred==i)[0]\n",
    "    x1 = X_train[index,0]\n",
    "    x2 = X_train[index,1]\n",
    "    y_i = y_train_pred[index]\n",
    "    for j in range(len(x1)):\n",
    "        if j < 20:  #每类打印20个\n",
    "            plt.text(x1[j],x2[j],str(int(y_i[j])),color=colors[i],\\\n",
    "                fontdict={'weight': 'bold', 'size': 5})\n",
    "    #plt.scatter(cents[i,0],cents[i,1],marker='x',color=colors[i],linewidths=12)\n",
    "\n",
    "plt.axis([-2,4,-4,12])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从图中可以看出，K=2时的聚类效果也不是太好：红色点的类内散度小，聚类效果比较好；但蓝色点的类内散度比较大，聚类效果不好。\n",
    "同时，红色点和蓝色点之间的类间散度比较小（两类没有明显分开），聚类效果也不好。\n",
    "\n",
    "注意：图中每类只画出了前20个点，以上结论只是基于这部分样本的直观观察。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
